1. Load and Pre-processing data¶
In [1]:
pip install plotly
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: plotly in /home/yuexia/.local/lib/python3.8/site-packages (5.22.0) Requirement already satisfied: tenacity>=6.2.0 in /home/yuexia/.local/lib/python3.8/site-packages (from plotly) (8.5.0) Requirement already satisfied: packaging in /home/yuexia/.local/lib/python3.8/site-packages (from plotly) (23.2) [notice] A new release of pip is available: 24.0 -> 24.1.2 [notice] To update, run: python3 -m pip install --upgrade pip Note: you may need to restart the kernel to use updated packages.
In [2]:
import numpy as np, pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from datetime import date
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
In [3]:
# load the data and show the first five rows info
data = pd.read_csv("marketing_campaign.csv",delimiter='\t')
data.shape # (2240, 29)
data.head().style.background_gradient(cmap='Pastel2')
Out[3]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.000000 | 0 | 0 | 04-09-2012 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.000000 | 1 | 1 | 08-03-2014 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.000000 | 0 | 0 | 21-08-2013 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 1 | 8 | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.000000 | 1 | 0 | 10-02-2014 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 2 | 2 | 0 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.000000 | 1 | 0 | 19-01-2014 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 5 | 5 | 3 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
In [4]:
# change data type with some of the column
data["Dt_Customer"] = pd.to_datetime(data['Dt_Customer'], dayfirst=True,format = '%d-%m-%Y')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null datetime64[ns] 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Complain 2240 non-null int64 26 Z_CostContact 2240 non-null int64 27 Z_Revenue 2240 non-null int64 28 Response 2240 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(25), object(2) memory usage: 507.6+ KB
In [5]:
# deal with missing values - drop
# data.isnull().sum() # income: 24
data=data.dropna(subset=['Income'])
data.isnull().sum()
Out[5]:
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 0 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Complain 0 Z_CostContact 0 Z_Revenue 0 Response 0 dtype: int64
In [6]:
print(data.Education.unique())
print(data.Marital_Status.unique())
data.columns
['Graduation' 'PhD' 'Master' 'Basic' '2n Cycle'] ['Single' 'Together' 'Married' 'Divorced' 'Widow' 'Alone' 'Absurd' 'YOLO']
Out[6]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
dtype='object')
In [7]:
# dates of the newest and oldest recorded customer
print("The newest customer's enrolment time:",max(data["Dt_Customer"]))
print("The oldest customer's enrolment time:",min(data["Dt_Customer"]))
The newest customer's enrolment time: 2014-06-29 00:00:00 The oldest customer's enrolment time: 2012-07-30 00:00:00
In [8]:
# combining similarity information columns, then dropping irrelevant columns, or creating a new feature based on existing features
data.Education = data.Education.replace(['PhD','2n Cycle','Graduation', 'Master'],'Post Graduate')
data.Education = data.Education.replace(['Basic'], 'Under Graduate')
data.Marital_Status = data.Marital_Status.replace(['Widow','Alone','Absurd','YOLO'],"Single")
data.Marital_Status = data.Marital_Status.replace(['Together','Married','Divorced'],"Relationship")
data["Kids"] = data.Kidhome + data.Teenhome
data["All_Cmp"] = data.AcceptedCmp1 + data.AcceptedCmp2 + data.AcceptedCmp3 + data.AcceptedCmp4 + data.AcceptedCmp5
data["Age"] = 2023 - data.Year_Birth #current year 2023.
data['Spending']=data.MntWines+data.MntFruits+data.MntMeatProducts+data.MntFishProducts+data.MntSweetProducts+data.MntGoldProds
# the last date of the entire csv
last_date=date(2014,6,29)
# the number of months that a customer has stayed
data['StayMonths'] = pd.to_numeric(data['Dt_Customer'].dt.date.apply(lambda x: (last_date - x)).dt.days, downcast='integer')/30
data = data.drop(["ID","Z_CostContact","Z_Revenue","Kidhome","Teenhome","AcceptedCmp1","AcceptedCmp2","AcceptedCmp3",
"AcceptedCmp4","AcceptedCmp5","Year_Birth","Dt_Customer"],axis=1)
data.head().style.background_gradient(cmap='Pastel2')
Out[8]:
| Education | Marital_Status | Income | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | Complain | Response | Kids | All_Cmp | Age | Spending | StayMonths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Post Graduate | Single | 58138.000000 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 1 | 0 | 0 | 66 | 1617 | 22.100000 |
| 1 | Post Graduate | Single | 46344.000000 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 2 | 0 | 69 | 27 | 3.766667 |
| 2 | Post Graduate | Relationship | 71613.000000 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 1 | 8 | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 58 | 776 | 10.400000 |
| 3 | Post Graduate | Relationship | 26646.000000 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 2 | 2 | 0 | 4 | 6 | 0 | 0 | 1 | 0 | 39 | 53 | 4.633333 |
| 4 | Post Graduate | Relationship | 58293.000000 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 5 | 5 | 3 | 6 | 5 | 0 | 0 | 1 | 0 | 42 | 422 | 5.366667 |
2. Exploratory data and visualisation¶
In [9]:
# plot1
fig, axes = plt.subplots(1, 3, figsize=(10, 4))
sns.countplot(x='Education', data=data, palette='Set2', ax=axes[0])
axes[0].set_title('Frequency of Education Situation')
sns.countplot(x='Marital_Status',data=data,ax=axes[1])
axes[1].set_title('Marital Status')
sns.countplot(x='Kids',data=data,palette='Set3',ax=axes[2])
plt.title("Frequency Of Each Category\n in the Kids Variable")
plt.tight_layout()
plt.show()
Findings:¶
- Education: I reset 'PhD', '2n Cycle',' Graduation', and 'Master' to "Post Graduate", and the "basic" is "Under Graduate". This dataset contains most of the postgraduate customers, over 2,000. Whereas just a few undergraduates, even though they are less than 250.
- Marital Status: I reset 'Widow', 'Alone', 'Absurd', and 'YOLO' as "Single" variables; the other is "Relationship". The people quantity is the difference around 1,000.
- Kids: I created a new column named "Kids", which concatenates "Kidhome" and "Teenhome". From the plot, we can see around 600 customers do not have kids; the highest number of people have only one kid, and only a few have more than three kids.
In [10]:
# plot2 average income in box plot
plt.figure(figsize=(6,4))
data.Income.plot.box(color ='turquoise')
plt.show()
Finding:¶
according to plot2, the box plot shows the income values contain outliers, which are above $600,000, so let's remove them
In [11]:
# deal with outliers
data = data[data["Income"] < 600000]
data.describe().style.background_gradient(cmap='Pastel2')
Out[11]:
| Income | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | Complain | Response | Kids | All_Cmp | Age | Spending | StayMonths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 |
| mean | 51969.861400 | 49.024379 | 305.225282 | 26.361625 | 167.063205 | 37.651016 | 27.040632 | 43.979684 | 2.322799 | 4.085779 | 2.671783 | 5.802257 | 5.318736 | 0.009481 | 0.150339 | 0.947178 | 0.298420 | 54.183296 | 607.321445 | 11.783461 |
| std | 21526.320095 | 28.949608 | 337.345380 | 39.802036 | 224.311559 | 54.760822 | 41.077594 | 51.822660 | 1.923820 | 2.741473 | 2.927179 | 3.250974 | 2.425863 | 0.096929 | 0.357484 | 0.749230 | 0.679332 | 11.987000 | 602.925291 | 6.749291 |
| min | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 27.000000 | 5.000000 | 0.000000 |
| 25% | 35284.000000 | 24.000000 | 24.000000 | 2.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | 1.000000 | 2.000000 | 0.000000 | 3.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 46.000000 | 69.000000 | 6.000000 |
| 50% | 51373.000000 | 49.000000 | 175.000000 | 8.000000 | 68.000000 | 12.000000 | 8.000000 | 25.000000 | 2.000000 | 4.000000 | 2.000000 | 5.000000 | 6.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 53.000000 | 397.000000 | 11.833333 |
| 75% | 68487.000000 | 74.000000 | 505.000000 | 33.000000 | 232.500000 | 50.000000 | 33.000000 | 56.000000 | 3.000000 | 6.000000 | 4.000000 | 8.000000 | 7.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 64.000000 | 1048.000000 | 17.633333 |
| max | 162397.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 262.000000 | 321.000000 | 15.000000 | 27.000000 | 28.000000 | 13.000000 | 20.000000 | 1.000000 | 1.000000 | 3.000000 | 4.000000 | 130.000000 | 2525.000000 | 23.300000 |
In [12]:
# plot3 average income
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(data.Income,color ='green',ax=axes[0])
sns.distplot(data.Spending,color ='orange', ax=axes[1])
plt.tight_layout()
plt.show()
Finding:¶
- after removing the outliers, the income variable appears bell curve. It is for later GMM method.
- Spending variable comes from the customers spending money on wine, fruits, meat, fish, sweets, and gold. It looks like skewed data.
In [13]:
# correlation matrix
# Selecting only numeric columns from the DataFrame
numeric_data = data.select_dtypes(include=['number'])
# Computing the correlation matrix
corrmat = numeric_data.corr()
# Plotting the heatmap
plt.figure(figsize=(18,18))
sns.heatmap(corrmat, annot=True, center=0, cmap="Set2")
plt.show()
3. Data Normalisation¶
Take 3 primany features ('Income','StayMonths','Spending') from the dataset for GMM clustering.
In [14]:
data_new = data[['Income','StayMonths','Spending']]
# standardise the numerical variables
column= data_new.columns
scaler = preprocessing.StandardScaler()
data_new_standard= scaler.fit_transform(data_new)
data_new_normalized = preprocessing.normalize(data_new_standard,norm='l2')
data_new_normalized
Out[14]:
array([[ 0.12537976, 0.66883499, 0.73276168],
[-0.16850324, -0.76582868, -0.6205748 ],
[ 0.93476116, -0.2099754 , 0.28658664],
...,
[ 0.15988431, -0.6733312 , 0.72184631],
[ 0.60693959, -0.73771711, 0.29563121],
[ 0.02765358, 0.87791651, -0.47801453]])
Identify the number of clusters using Silhouette Score¶
In [15]:
# silhouette method
scores = []
for i in range(2, 15):
GMM=GaussianMixture(n_components=i,random_state=42)
clusters = GMM.fit_predict(data_new_normalized)
scores.append(metrics.silhouette_score(data_new_normalized, clusters, metric='euclidean'))
plt.figure(figsize=(8,4))
plt.plot(range(2, 15), scores, 'bo-', color='purple')
plt.xlabel('clusters')
plt.ylabel('Silhouette Score')
plt.title('Identify the number of clusters using Silhouette Score')
plt.show()
Finding:¶
- The n_components = 4 is highest silhouette score, which is 0.47. So, I will plot the 4 clusters in next step.
- The n_components = 2 has silhouette score 0.46.
- Others n components I do not consider.
4. Clustering Algorithm - GaussianMixture¶
In [16]:
# apply GaussianMixture clustering method with n_components=4
GMM=GaussianMixture(n_components=4,random_state=42).fit(data_new_normalized)
cluster_predicted = GMM.predict(data_new_normalized)
# plot
plt.figure(figsize=(6,6))
plt.scatter(data_new_normalized[:, 0], data_new_normalized[:, 1], c=cluster_predicted, cmap='Set2');
Finding:¶
The plot shows that the data have been nicely grouped into four groups.
In [17]:
# added cluster_predicted value into data_new data frame
data_new["Cluster"] = cluster_predicted
data_new.head().style.background_gradient(cmap='Pastel2')
Out[17]:
| Income | StayMonths | Spending | Cluster | |
|---|---|---|---|---|
| 0 | 58138.000000 | 22.100000 | 1617 | 1 |
| 1 | 46344.000000 | 3.766667 | 27 | 0 |
| 2 | 71613.000000 | 10.400000 | 776 | 3 |
| 3 | 26646.000000 | 4.633333 | 53 | 0 |
| 4 | 58293.000000 | 5.366667 | 422 | 3 |
4. Summary¶
In [18]:
summary = data_new.groupby('Cluster').agg({'mean'})
# Visualisasi
plt.figure(figsize=(15, 4))
for index, variable in enumerate(data_new.columns):
if index > 2:
break
plt.subplot(1,3,index+1)
sns.barplot(x = summary.reset_index().Cluster, y = summary[variable]['mean'],palette='Set2')
plt.ylabel(variable, fontsize=10)
plt.xlabel('Cluster', fontsize=10)
plt.show()
Insights¶
The dataset has following 4 clusters, which are 0,1,2 and 3.
- Cluster 0: new customers, low income, small spending ("Attention")
- Cluster 1: new customers, high income, high spending ("Target Customer")
- Cluster 2: old customers, low income, small spending ("Not interested")
- Cluster 3: old customers, high income, high spending ("Loyal High Spending")
Visualisation for Insights¶
In [19]:
# Visualisation
data_new=data_new.replace({0:'Attention',1:'Target Customer',2:'Not interested',3:'Loyal High Spending'})
PLOT = go.Figure()
colors = ['orange', 'pink', 'skyblue', 'lightgrey']
color_index = 0
for C in list(data_new.Cluster.unique()):
PLOT.add_trace(go.Scatter3d(x = data_new[data_new.Cluster == C]['Income'],
y = data_new[data_new.Cluster == C]['StayMonths'],
z = data_new[data_new.Cluster == C]['Spending'],
mode = 'markers',marker_size = 6, marker_line_width = 1,
marker_color=colors[color_index],
name = str(C)))
color_index += 1
PLOT.update_traces(hovertemplate='Income: %{x} <br>StayMonths: %{y} <br>Spending: %{z}')
PLOT.update_layout(width = 850, height = 850, autosize = True, showlegend = True,
scene = dict(xaxis=dict(title = 'Income', titlefont_color = 'black'),
yaxis=dict(title = 'StayMonths', titlefont_color = 'black'),
zaxis=dict(title = 'Spending', titlefont_color = 'black')),
font = dict(family = "Gilroy", color = 'black', size = 12))
In [ ]:
In [ ]:
In [ ]:
In [ ]: